#!/usr/bin/env python
# encoding=utf-8
import re
from xtls.codehelper import timeit
try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage


@timeit
def convert(fp):
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    for page in PDFPage.get_pages(fp, set()):
        interpreter.process_page(page)

    text = retstr.getvalue()

    device.close()
    retstr.close()
    return re.sub(ur'[\n]+', '\n', text.decode('utf-8'))


if __name__ == '__main__':
    # with open('/home/xlzd/abc.pdf', 'rb') as fp:
    #     fp = StringIO(fp.read())
    # # StringIO()
    # print len(convert(fp))
    import requests
    url = 'http://7xqxbi.com2.z0.glb.qiniucdn.com/98c380409a3a88a99cf9d2e052cce8ae74b971430ade224443c77cdb9875c6c1.pdf'
    data = requests.get(url, headers={
        'Host': 'www.cninfo.com.cn',
        'Upgrade-Insecure-Requests': 1,
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36'
    }).content
    fp = StringIO(data)
    print convert(fp)
